knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
library(tidymodels)
library(factoextra)
library(cluster)
library(plotly)
library(skimr)
# https://www.openml.org/d/1590
raw_income = read_csv("https://raw.githubusercontent.com/chanks06/ml-model3/main/datasets/openml_1590.csv", na=c("?"))
income = read_csv("https://raw.githubusercontent.com/chanks06/ml-model3/main/datasets/openml_1590.csv", na=c("?")) %>%
drop_na() %>%
mutate(income_above_50k = class==">50K") %>%
select(-class) %>%
dummy_cols(remove_selected_columns = T)
skim(income)
| Name | income |
| Number of rows | 45222 |
| Number of columns | 105 |
| _______________________ | |
| Column type frequency: | |
| logical | 1 |
| numeric | 104 |
| ________________________ | |
| Group variables | None |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| income_above_50k | 0 | 1 | 0.25 | FAL: 34014, TRU: 11208 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 38.55 | 13.22 | 17 | 28.0 | 37 | 47 | 90 | ▇▇▅▁▁ |
| fnlwgt | 0 | 1 | 189734.73 | 105639.20 | 13492 | 117388.2 | 178316 | 237926 | 1490400 | ▇▁▁▁▁ |
| education-num | 0 | 1 | 10.12 | 2.55 | 1 | 9.0 | 10 | 13 | 16 | ▁▁▇▃▁ |
| capital-gain | 0 | 1 | 1101.43 | 7506.43 | 0 | 0.0 | 0 | 0 | 99999 | ▇▁▁▁▁ |
| capital-loss | 0 | 1 | 88.60 | 404.96 | 0 | 0.0 | 0 | 0 | 4356 | ▇▁▁▁▁ |
| hours-per-week | 0 | 1 | 40.94 | 12.01 | 1 | 40.0 | 40 | 45 | 99 | ▁▇▃▁▁ |
| workclass_Federal-gov | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| workclass_Local-gov | 0 | 1 | 0.07 | 0.25 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| workclass_Private | 0 | 1 | 0.74 | 0.44 | 0 | 0.0 | 1 | 1 | 1 | ▃▁▁▁▇ |
| workclass_Self-emp-inc | 0 | 1 | 0.04 | 0.19 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| workclass_Self-emp-not-inc | 0 | 1 | 0.08 | 0.28 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| workclass_State-gov | 0 | 1 | 0.04 | 0.20 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| workclass_Without-pay | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_1st-4th | 0 | 1 | 0.00 | 0.07 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_5th-6th | 0 | 1 | 0.01 | 0.10 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_7th-8th | 0 | 1 | 0.02 | 0.13 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_9th | 0 | 1 | 0.01 | 0.12 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_10th | 0 | 1 | 0.03 | 0.16 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_11th | 0 | 1 | 0.04 | 0.19 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_12th | 0 | 1 | 0.01 | 0.11 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_Assoc-acdm | 0 | 1 | 0.03 | 0.18 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_Assoc-voc | 0 | 1 | 0.04 | 0.20 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_Bachelors | 0 | 1 | 0.17 | 0.37 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▂ |
| education_Doctorate | 0 | 1 | 0.01 | 0.11 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_HS-grad | 0 | 1 | 0.33 | 0.47 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| education_Masters | 0 | 1 | 0.06 | 0.23 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_Preschool | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_Prof-school | 0 | 1 | 0.02 | 0.13 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| education_Some-college | 0 | 1 | 0.22 | 0.41 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▂ |
| marital-status_Divorced | 0 | 1 | 0.14 | 0.35 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| marital-status_Married-AF-spouse | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| marital-status_Married-civ-spouse | 0 | 1 | 0.47 | 0.50 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▇ |
| marital-status_Married-spouse-absent | 0 | 1 | 0.01 | 0.11 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| marital-status_Never-married | 0 | 1 | 0.32 | 0.47 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| marital-status_Separated | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| marital-status_Widowed | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Adm-clerical | 0 | 1 | 0.12 | 0.33 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Armed-Forces | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Craft-repair | 0 | 1 | 0.13 | 0.34 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Exec-managerial | 0 | 1 | 0.13 | 0.34 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Farming-fishing | 0 | 1 | 0.03 | 0.18 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Handlers-cleaners | 0 | 1 | 0.05 | 0.21 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Machine-op-inspct | 0 | 1 | 0.07 | 0.25 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Other-service | 0 | 1 | 0.11 | 0.31 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Priv-house-serv | 0 | 1 | 0.01 | 0.07 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Prof-specialty | 0 | 1 | 0.13 | 0.34 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Protective-serv | 0 | 1 | 0.02 | 0.15 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Sales | 0 | 1 | 0.12 | 0.32 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Tech-support | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| occupation_Transport-moving | 0 | 1 | 0.05 | 0.22 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| relationship_Husband | 0 | 1 | 0.41 | 0.49 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▆ |
| relationship_Not-in-family | 0 | 1 | 0.26 | 0.44 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| relationship_Other-relative | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| relationship_Own-child | 0 | 1 | 0.15 | 0.35 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▂ |
| relationship_Unmarried | 0 | 1 | 0.11 | 0.31 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| relationship_Wife | 0 | 1 | 0.05 | 0.21 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| race_Amer-Indian-Eskimo | 0 | 1 | 0.01 | 0.10 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| race_Asian-Pac-Islander | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| race_Black | 0 | 1 | 0.09 | 0.29 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| race_Other | 0 | 1 | 0.01 | 0.09 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| race_White | 0 | 1 | 0.86 | 0.35 | 0 | 1.0 | 1 | 1 | 1 | ▁▁▁▁▇ |
| sex_Female | 0 | 1 | 0.32 | 0.47 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| sex_Male | 0 | 1 | 0.68 | 0.47 | 0 | 0.0 | 1 | 1 | 1 | ▃▁▁▁▇ |
| native-country_Cambodia | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Canada | 0 | 1 | 0.00 | 0.06 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_China | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Columbia | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Cuba | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Dominican-Republic | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Ecuador | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_El-Salvador | 0 | 1 | 0.00 | 0.06 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_England | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_France | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Germany | 0 | 1 | 0.00 | 0.07 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Greece | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Guatemala | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Haiti | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Holand-Netherlands | 0 | 1 | 0.00 | 0.00 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Honduras | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Hong | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Hungary | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_India | 0 | 1 | 0.00 | 0.06 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Iran | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Ireland | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Italy | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Jamaica | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Japan | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Laos | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Mexico | 0 | 1 | 0.02 | 0.14 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Nicaragua | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Outlying-US(Guam-USVI-etc) | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Peru | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Philippines | 0 | 1 | 0.01 | 0.08 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Poland | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Portugal | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Puerto-Rico | 0 | 1 | 0.00 | 0.06 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Scotland | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_South | 0 | 1 | 0.00 | 0.05 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Taiwan | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Thailand | 0 | 1 | 0.00 | 0.03 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Trinadad&Tobago | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_United-States | 0 | 1 | 0.91 | 0.28 | 0 | 1.0 | 1 | 1 | 1 | ▁▁▁▁▇ |
| native-country_Vietnam | 0 | 1 | 0.00 | 0.04 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| native-country_Yugoslavia | 0 | 1 | 0.00 | 0.02 | 0 | 0.0 | 0 | 0 | 1 | ▇▁▁▁▁ |
# Specify a recipe for pca
pca_rec <- recipe(~ ., data = income) %>%
update_role(income_above_50k, new_role = "ID") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), num_comp = 2, id = "pca")
# Print out recipe
pca_rec
# Estimate required statistcs
pca_estimates <- prep(pca_rec)
# Return preprocessed data using bake
features_2d <- pca_estimates %>%
bake(new_data = NULL)
# Print baked data set
features_2d %>%
slice_head(n = 5)
## # A tibble: 5 × 3
## income_above_50k PC1 PC2
## <lgl> <dbl> <dbl>
## 1 FALSE -2.31 2.63
## 2 FALSE 2.21 1.54
## 3 TRUE 2.29 -1.03
## 4 TRUE 1.05 1.30
## 5 FALSE -1.79 1.79
# Examine how much variance each PC accounts for
pca_estimates %>%
tidy(id = "pca", type = "variance") %>%
filter(str_detect(terms, "percent"))
## # A tibble: 208 × 4
## terms value component id
## <chr> <dbl> <int> <chr>
## 1 percent variance 4.38 1 pca
## 2 percent variance 2.98 2 pca
## 3 percent variance 2.51 3 pca
## 4 percent variance 2.26 4 pca
## 5 percent variance 1.87 5 pca
## 6 percent variance 1.71 6 pca
## 7 percent variance 1.64 7 pca
## 8 percent variance 1.52 8 pca
## 9 percent variance 1.38 9 pca
## 10 percent variance 1.34 10 pca
## # … with 198 more rows
theme_set(theme_light())
# Plot how much variance each PC accounts for
pca_estimates %>%
tidy(id = "pca", type = "variance") %>%
filter(terms == "percent variance") %>%
ggplot(mapping = aes(x = component, y = value)) +
geom_col(fill = "midnightblue", alpha = 0.7) +
ylab("% of total variance")
# Visualize PC scores
features_2d %>%
ggplot(mapping = aes(x = PC1, y = PC2)) +
geom_point(size = 2, color = "dodgerblue3")+
geom_jitter(alpha= 0.1)
capital-gain fnlwgt capital-loss
# Drop target column and normalize data
income_features<- recipe(~ ., data = income) %>%
step_rm(income_above_50k) %>%
step_naomit(everything(), skip = TRUE) %>%
step_dummy(all_nominal(), -all_outcomes()) %>% #converts our factor columns into numeric binary (0 and 1) variables.
step_zv(all_numeric(), -all_outcomes()) %>% ## step_zv(): removes any numeric variables that have zero variance.
prep() %>%
bake(new_data = NULL)
# Print out data
income_features %>%
slice_head(n = 5)
## # A tibble: 5 × 104
## age fnlwgt educati…¹ capit…² capit…³ hours…⁴ workc…⁵ workc…⁶ workc…⁷ workc…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int>
## 1 25 226802 7 0 0 40 0 0 1 0
## 2 38 89814 9 0 0 50 0 0 1 0
## 3 28 336951 12 0 0 40 0 1 0 0
## 4 44 160323 10 7688 0 40 0 0 1 0
## 5 34 198693 6 0 0 30 0 0 1 0
## # … with 94 more variables: `workclass_Self-emp-not-inc` <int>,
## # `workclass_State-gov` <int>, `workclass_Without-pay` <int>,
## # `education_1st-4th` <int>, `education_5th-6th` <int>,
## # `education_7th-8th` <int>, education_9th <int>, education_10th <int>,
## # education_11th <int>, education_12th <int>, `education_Assoc-acdm` <int>,
## # `education_Assoc-voc` <int>, education_Bachelors <int>,
## # education_Doctorate <int>, `education_HS-grad` <int>, …
One way we can try to find out is to use a data sample to create a series of clustering models with an incrementing number of clusters, and measure how tightly the data points are grouped within each cluster. A metric often used to measure this tightness is the within cluster sum of squares (WCSS), with lower values meaning that the data points are closer. You can then plot the WCSS for each model.
We’ll use the built-in kmeans() function, which accepts a data frame with all numeric columns as it’s primary argument to perform clustering - means we’ll have to drop the species column. For clustering, it is recommended that the data have the same scale. We can use the recipes package to perform these transformations.
set.seed(503)
# Create 10 models with 1 to 10 clusters
kclusts <- tibble(k = 1:10) %>%
mutate(
model = map(k, ~ kmeans(x = income_features, centers = .x, nstart = 20)),
glanced = map(model, glance)) %>%
unnest(cols = c(glanced))
# View results
kclusts
## # A tibble: 10 × 6
## k model totss tot.withinss betweenss iter
## <int> <list> <dbl> <dbl> <dbl> <int>
## 1 1 <kmeans> 5.07e14 5.07e14 4.63e 3 1
## 2 2 <kmeans> 5.07e14 2.10e14 2.97e14 1
## 3 3 <kmeans> 5.07e14 1.14e14 3.93e14 2
## 4 4 <kmeans> 5.07e14 7.53e13 4.32e14 2
## 5 5 <kmeans> 5.07e14 5.40e13 4.53e14 2
## 6 6 <kmeans> 5.07e14 3.90e13 4.68e14 3
## 7 7 <kmeans> 5.07e14 2.97e13 4.78e14 4
## 8 8 <kmeans> 5.07e14 2.40e13 4.83e14 4
## 9 9 <kmeans> 5.07e14 1.95e13 4.88e14 3
## 10 10 <kmeans> 5.07e14 1.62e13 4.91e14 5
# Plot Total within-cluster sum of squares (tot.withinss)
kclusts %>%
ggplot(mapping = aes(x = k, y = tot.withinss)) +
geom_line(size = 1.2, alpha = 0.5, color = "dodgerblue3") +
geom_point(size = 2, color = "dodgerblue3")
set.seed(503)
# Fit and predict clusters with k = 3
final_kmeans <- kmeans(income_features, centers =2, nstart = 100, iter.max = 1000)
# Add cluster prediction to the data set
results <- augment(final_kmeans, income_features) %>%
# Bind pca_data - features_2d
bind_cols(features_2d)
results %>%
slice_head(n = 5)
## # A tibble: 5 × 108
## age fnlwgt educati…¹ capit…² capit…³ hours…⁴ workc…⁵ workc…⁶ workc…⁷ workc…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int>
## 1 25 226802 7 0 0 40 0 0 1 0
## 2 38 89814 9 0 0 50 0 0 1 0
## 3 28 336951 12 0 0 40 0 1 0 0
## 4 44 160323 10 7688 0 40 0 0 1 0
## 5 34 198693 6 0 0 30 0 0 1 0
## # … with 98 more variables: `workclass_Self-emp-not-inc` <int>,
## # `workclass_State-gov` <int>, `workclass_Without-pay` <int>,
## # `education_1st-4th` <int>, `education_5th-6th` <int>,
## # `education_7th-8th` <int>, education_9th <int>, education_10th <int>,
## # education_11th <int>, education_12th <int>, `education_Assoc-acdm` <int>,
## # `education_Assoc-voc` <int>, education_Bachelors <int>,
## # education_Doctorate <int>, `education_HS-grad` <int>, …
# Plot km_cluster assignmnet on the PC data
cluster_plot <- results %>%
ggplot(mapping = aes(x = PC1, y = PC2)) +
geom_point(aes(shape = .cluster), size = 2) +
scale_color_manual(values = c("darkorange","purple","cyan4"))
# Make plot interactive
ggplotly(cluster_plot)
clust_spc_plot <- results %>%
ggplot(mapping = aes(x = PC1, y = PC2)) +
geom_point(aes(shape = .cluster, color = income_above_50k), size = 2, alpha = 0.8) +
scale_color_manual(values = c("darkorange","purple","cyan4"))
# Make plot interactive
ggplotly(clust_spc_plot)
10 pts supervised learning